In [914]:
!pip install arch
Requirement already satisfied: arch in /usr/local/lib/python3.7/dist-packages (5.1.0)
Requirement already satisfied: numpy>=1.17 in /usr/local/lib/python3.7/dist-packages (from arch) (1.19.5)
Requirement already satisfied: property-cached>=1.6.4 in /usr/local/lib/python3.7/dist-packages (from arch) (1.6.4)
Requirement already satisfied: statsmodels>=0.11 in /usr/local/lib/python3.7/dist-packages (from arch) (0.13.1)
Requirement already satisfied: scipy>=1.3 in /usr/local/lib/python3.7/dist-packages (from arch) (1.4.1)
Requirement already satisfied: pandas>=1.0 in /usr/local/lib/python3.7/dist-packages (from arch) (1.1.5)
Requirement already satisfied: python-dateutil>=2.7.3 in /usr/local/lib/python3.7/dist-packages (from pandas>=1.0->arch) (2.8.2)
Requirement already satisfied: pytz>=2017.2 in /usr/local/lib/python3.7/dist-packages (from pandas>=1.0->arch) (2018.9)
Requirement already satisfied: six>=1.5 in /usr/local/lib/python3.7/dist-packages (from python-dateutil>=2.7.3->pandas>=1.0->arch) (1.15.0)
Requirement already satisfied: patsy>=0.5.2 in /usr/local/lib/python3.7/dist-packages (from statsmodels>=0.11->arch) (0.5.2)
In [915]:
import os
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from datetime import date
from scipy import stats

%matplotlib inline
import warnings
warnings.filterwarnings('ignore')

from datetime import datetime, timedelta
from sklearn.metrics import mean_squared_error as mse
from sklearn.preprocessing import MinMaxScaler
from statsmodels.graphics.tsaplots import plot_acf, plot_pacf

import ipywidgets as widgets
from IPython.display import display
In [1918]:
# Ticker selector; the chosen value drives which CSV the next cell loads.
STOCK_TICKERS = ['SELECT', 'AAPL', 'ABUS', 'ARDS', 'BABA', 'BFRI',
                 'FB', 'GME', 'MCD', 'PFE', 'PLUG',
                 'QCOM', 'SENS', 'TSLA', 'TWTR', 'UUUU']

w = widgets.Dropdown(
    options=STOCK_TICKERS,
    value='SELECT',
    description='Stock name:',
)


def on_change(change):
    """Echo the newly selected ticker whenever the dropdown value changes."""
    # Ignore every event except an actual change of the 'value' trait.
    if change['type'] != 'change' or change['name'] != 'value':
        return
    print("You have selected %s" % change['new'])


w.observe(on_change)

display(w)
You have selected QCOM
In [1919]:
# Load the per-ticker feature CSV matching the dropdown selection.
# A single formatted path replaces the original 15-branch if-chain
# (every branch followed the same '/content/Final_<TICKER>.csv' pattern).
# NOTE(review): the absolute /content/ path is Colab-specific.
if w.value == 'SELECT':
    # Fail loudly here instead of leaving `df` undefined and getting a
    # confusing NameError in a later cell.
    raise ValueError('Select a stock from the dropdown before running this cell.')
df = pd.read_csv(f'/content/Final_{w.value}.csv')
In [1920]:
# Show full (untruncated) cell contents when displaying DataFrames.
pd.set_option('display.max_colwidth', None)
In [1921]:
# Parse the Date column into proper datetimes. pd.to_datetime is the
# idiomatic equivalent of the raw astype("datetime64[ns]") cast and gives
# the same result for the ISO-formatted dates in these CSVs.
df['Date'] = pd.to_datetime(df['Date'])
In [1922]:
# Drop the CSV's stale positional index column. errors='ignore' makes the
# cell safe to re-run (the original `del df['Unnamed: 0']` raised KeyError
# on a second execution).
df = df.drop(columns=['Unnamed: 0'], errors='ignore')
In [1923]:
# First five rows: OHLCV data plus engineered technical-indicator and
# tweet-derived feature columns.
df.head(5)
Out[1923]:
Date Open High Low Close Adj Close Volume Return Beta Variance AvgTrueRange Upperband Lowerband Middleband APO NATR TRANGE DMI MACD MACDSIGNAL MACDHIST MOM PPO ROCP RSI TRIX ULTOSC SLOWK SLOWD AD ADOSC OBV Upward_momentum_created Downward_momentum_created B5_O_Um B5_C_Um B5_E_Um B5_A_Um B5_N_Um B5_O_Dm B5_C_Dm B5_E_Dm B5_A_Dm B5_N_Dm Verified_status_True Verified_status_False O C E A N Real_or_Fake_tweet
0 2020-08-12 110.389999 116.250000 109.650002 115.790001 112.602760 18664700 6.395294 0.490839 7.739309 4.697455 115.939637 104.811791 110.375714 NaN 4.056874 7.419998 59.272966 NaN NaN NaN 22.760002 NaN 0.244652 74.017064 NaN NaN 52.826115 41.320446 6.494750e+07 1.098214e+07 88773700.0 0.0 1.346027e+06 0.0 0.0 0.0 0.0 0.0 1.346027e+06 1.346027e+06 0.0 0.0 1.346027e+06 2 133 135 135 0 0 135 135
1 2020-08-13 116.199997 116.199997 112.769997 113.410004 110.288277 9958400 -2.055443 0.628667 8.883965 4.516390 116.688340 104.765947 110.727143 NaN 3.982356 3.430000 59.272966 NaN NaN NaN 6.220001 NaN 0.058028 68.896979 NaN NaN 74.769547 53.044967 5.870540e+07 9.533783e+06 78815300.0 0.0 8.899546e+05 0.0 0.0 0.0 0.0 0.0 8.899546e+05 8.899546e+05 0.0 0.0 8.899546e+05 1 149 150 150 0 0 150 150
2 2020-08-14 113.300003 114.519997 112.570000 113.739998 110.609177 6641000 0.290975 0.579546 10.005249 4.149762 117.389072 104.736642 111.062857 NaN 3.648463 1.949997 56.780601 NaN NaN NaN 8.129997 NaN 0.076981 69.214959 NaN NaN 83.174062 70.256575 6.003360e+07 8.497175e+06 85456300.0 0.0 7.842553e+05 0.0 0.0 0.0 0.0 0.0 7.842553e+05 7.842553e+05 0.0 0.0 7.842553e+05 1 80 81 81 0 0 81 81
3 2020-08-17 113.290001 114.489998 110.629997 112.180000 109.092125 10283000 -1.371547 0.624033 10.157935 4.108368 117.597161 104.848555 111.222858 9.525128 3.662300 3.860001 33.193332 NaN NaN NaN 2.590004 9.424145 0.023634 65.790720 NaN NaN 72.719385 76.887665 5.800898e+07 6.656441e+06 75173300.0 0.0 9.046535e+05 0.0 0.0 0.0 0.0 0.0 9.046535e+05 9.046535e+05 0.0 0.0 9.046535e+05 1 96 97 97 0 0 97 97
4 2020-08-18 112.910004 113.379997 111.519997 112.029999 108.946251 6387000 -0.133715 0.482543 8.696848 3.787172 117.660941 105.864774 111.762858 9.263975 3.380499 1.860001 33.193332 NaN NaN NaN 1.080002 9.094124 0.009734 65.455395 NaN NaN 59.630962 71.841470 5.512454e+07 4.380507e+06 68786300.0 0.0 5.959879e+05 0.0 0.0 0.0 0.0 0.0 5.959879e+05 5.959879e+05 0.0 0.0 5.959879e+05 2 112 114 114 0 0 114 114
In [1924]:
# Column dtypes and non-null counts; several indicators (APO, MACD, TRIX,
# ULTOSC) carry leading NaNs from their rolling warm-up windows.
df.info()
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 343 entries, 0 to 342
Data columns (total 52 columns):
 #   Column                     Non-Null Count  Dtype         
---  ------                     --------------  -----         
 0   Date                       343 non-null    datetime64[ns]
 1   Open                       343 non-null    float64       
 2   High                       343 non-null    float64       
 3   Low                        343 non-null    float64       
 4   Close                      343 non-null    float64       
 5   Adj Close                  343 non-null    float64       
 6   Volume                     343 non-null    int64         
 7   Return                     343 non-null    float64       
 8   Beta                       343 non-null    float64       
 9   Variance                   343 non-null    float64       
 10  AvgTrueRange               343 non-null    float64       
 11  Upperband                  343 non-null    float64       
 12  Lowerband                  343 non-null    float64       
 13  Middleband                 343 non-null    float64       
 14  APO                        340 non-null    float64       
 15  NATR                       343 non-null    float64       
 16  TRANGE                     343 non-null    float64       
 17  DMI                        343 non-null    float64       
 18  MACD                       332 non-null    float64       
 19  MACDSIGNAL                 332 non-null    float64       
 20  MACDHIST                   332 non-null    float64       
 21  MOM                        343 non-null    float64       
 22  PPO                        340 non-null    float64       
 23  ROCP                       343 non-null    float64       
 24  RSI                        343 non-null    float64       
 25  TRIX                       277 non-null    float64       
 26  ULTOSC                     337 non-null    float64       
 27  SLOWK                      343 non-null    float64       
 28  SLOWD                      343 non-null    float64       
 29  AD                         343 non-null    float64       
 30  ADOSC                      343 non-null    float64       
 31  OBV                        343 non-null    float64       
 32  Upward_momentum_created    343 non-null    float64       
 33  Downward_momentum_created  343 non-null    float64       
 34  B5_O_Um                    343 non-null    float64       
 35  B5_C_Um                    343 non-null    float64       
 36  B5_E_Um                    343 non-null    float64       
 37  B5_A_Um                    343 non-null    float64       
 38  B5_N_Um                    343 non-null    float64       
 39  B5_O_Dm                    343 non-null    float64       
 40  B5_C_Dm                    343 non-null    float64       
 41  B5_E_Dm                    343 non-null    float64       
 42  B5_A_Dm                    343 non-null    float64       
 43  B5_N_Dm                    343 non-null    float64       
 44  Verified_status_True       343 non-null    int64         
 45  Verified_status_False      343 non-null    int64         
 46  O                          343 non-null    int64         
 47  C                          343 non-null    int64         
 48  E                          343 non-null    int64         
 49  A                          343 non-null    int64         
 50  N                          343 non-null    int64         
 51  Real_or_Fake_tweet         343 non-null    int64         
dtypes: datetime64[ns](1), float64(42), int64(9)
memory usage: 139.5 KB
In [1925]:
# (rows, columns) of the raw feature frame.
df.shape
Out[1925]:
(343, 52)
In [1926]:
# Shrink seaborn fonts. NOTE(review): the next cell immediately calls
# sns.set_context("talk", font_scale=1.3), which overrides this setting.
sns.set(font_scale=0.8)
In [1927]:
# Bump font sizes for presentation ("talk" context; switch to "poster"
# for even larger text).
sns.set_context("talk", font_scale=1.3)

# Plot the selected stock's daily closing price over the full date range.
# (Original comment mentioned BTC-USD since 2014 — copied from a tutorial;
# this frame holds the dropdown-selected stock starting Aug 2020.)
with sns.axes_style("darkgrid"):
    fig, ax = plt.subplots(figsize=(18,8))
    sns.lineplot(x=df.Date, y=df.Close, color='blue')
    ax.set_title('Closing Price')    
In [1928]:
# Daily percentage returns from closing prices.
# The original chained .dropna() here was a no-op: assigning the Series
# back into the frame re-aligns on the index, so the first row comes back
# as NaN either way. It is handled explicitly by the dropna cell below.
df['returns'] = 100 * df.Close.pct_change()
In [1929]:
# Log returns: r_t = ln(P_t / P_{t-1}); the first row is NaN because it
# has no prior price.
df['log_returns'] = np.log(df['Close'] / df['Close'].shift(1))
In [1930]:
# Preview the frame with the new returns / log_returns columns appended.
df.head()
Out[1930]:
Date Open High Low Close Adj Close Volume Return Beta Variance AvgTrueRange Upperband Lowerband Middleband APO NATR TRANGE DMI MACD MACDSIGNAL MACDHIST MOM PPO ROCP RSI TRIX ULTOSC SLOWK SLOWD AD ADOSC OBV Upward_momentum_created Downward_momentum_created B5_O_Um B5_C_Um B5_E_Um B5_A_Um B5_N_Um B5_O_Dm B5_C_Dm B5_E_Dm B5_A_Dm B5_N_Dm Verified_status_True Verified_status_False O C E A N Real_or_Fake_tweet returns log_returns
0 2020-08-12 110.389999 116.250000 109.650002 115.790001 112.602760 18664700 6.395294 0.490839 7.739309 4.697455 115.939637 104.811791 110.375714 NaN 4.056874 7.419998 59.272966 NaN NaN NaN 22.760002 NaN 0.244652 74.017064 NaN NaN 52.826115 41.320446 6.494750e+07 1.098214e+07 88773700.0 0.0 1.346027e+06 0.0 0.0 0.0 0.0 0.0 1.346027e+06 1.346027e+06 0.0 0.0 1.346027e+06 2 133 135 135 0 0 135 135 NaN NaN
1 2020-08-13 116.199997 116.199997 112.769997 113.410004 110.288277 9958400 -2.055443 0.628667 8.883965 4.516390 116.688340 104.765947 110.727143 NaN 3.982356 3.430000 59.272966 NaN NaN NaN 6.220001 NaN 0.058028 68.896979 NaN NaN 74.769547 53.044967 5.870540e+07 9.533783e+06 78815300.0 0.0 8.899546e+05 0.0 0.0 0.0 0.0 0.0 8.899546e+05 8.899546e+05 0.0 0.0 8.899546e+05 1 149 150 150 0 0 150 150 -2.055443 -0.020769
2 2020-08-14 113.300003 114.519997 112.570000 113.739998 110.609177 6641000 0.290975 0.579546 10.005249 4.149762 117.389072 104.736642 111.062857 NaN 3.648463 1.949997 56.780601 NaN NaN NaN 8.129997 NaN 0.076981 69.214959 NaN NaN 83.174062 70.256575 6.003360e+07 8.497175e+06 85456300.0 0.0 7.842553e+05 0.0 0.0 0.0 0.0 0.0 7.842553e+05 7.842553e+05 0.0 0.0 7.842553e+05 1 80 81 81 0 0 81 81 0.290975 0.002906
3 2020-08-17 113.290001 114.489998 110.629997 112.180000 109.092125 10283000 -1.371547 0.624033 10.157935 4.108368 117.597161 104.848555 111.222858 9.525128 3.662300 3.860001 33.193332 NaN NaN NaN 2.590004 9.424145 0.023634 65.790720 NaN NaN 72.719385 76.887665 5.800898e+07 6.656441e+06 75173300.0 0.0 9.046535e+05 0.0 0.0 0.0 0.0 0.0 9.046535e+05 9.046535e+05 0.0 0.0 9.046535e+05 1 96 97 97 0 0 97 97 -1.371547 -0.013810
4 2020-08-18 112.910004 113.379997 111.519997 112.029999 108.946251 6387000 -0.133715 0.482543 8.696848 3.787172 117.660941 105.864774 111.762858 9.263975 3.380499 1.860001 33.193332 NaN NaN NaN 1.080002 9.094124 0.009734 65.455395 NaN NaN 59.630962 71.841470 5.512454e+07 4.380507e+06 68786300.0 0.0 5.959879e+05 0.0 0.0 0.0 0.0 0.0 5.959879e+05 5.959879e+05 0.0 0.0 5.959879e+05 2 112 114 114 0 0 114 114 -0.133715 -0.001338
In [1931]:
# Drop every row containing any NaN.
# NOTE(review): despite the original comment, this removes far more than
# the first (pct_change/shift) row — indicator columns such as TRIX, MACD
# and ULTOSC have long warm-up NaN runs, so ~66 leading rows are discarded
# here (343 -> 277 rows, as the df.info() output below confirms).
df.dropna(inplace=True)
In [1932]:
# Time series and distribution of returns and log returns, each compared
# visually against a fitted normal curve.
with sns.axes_style("darkgrid"):
    fig, axes = plt.subplots(nrows=2, ncols=2, figsize=(18,12))

    series_specs = [
        (df.returns, 'blue', 'Returns'),
        (df.log_returns, 'green', 'Log Returns'),
    ]
    for row, (series, color, label) in enumerate(series_specs):
        # Left column: raw series over time.
        axes[row][0].plot(series, color=color)
        axes[row][0].set_title(label)

        # Right column: histogram with fitted normal overlay.
        sns.distplot(series, norm_hist=True, fit=stats.norm, color=color,
                     bins=50, ax=axes[row][1])
        axes[row][1].set_title(label)

    plt.tight_layout()
    fig.show();
In [1933]:
# Helper used by the rolling-window volatility cells below.
def realized_volatility_daily(series_log_return):
    """Realized volatility over a window of daily log returns.

    Computed as the square root of the sum of squared log returns divided
    by (n - 1), i.e. a sample-style normalization of the window.
    """
    n = len(series_log_return)
    sum_of_squares = np.sum(series_log_return ** 2)
    return np.sqrt(sum_of_squares / (n - 1))
In [1934]:
# Window lengths (trading days) to compare; `intervals` and `vols_df` are
# read by the plotting cell below, so their names are kept.
intervals = [7, 30, 60, 180, 365]

# Realized-volatility series per window length, keyed by interval.
volatility_by_window = {
    win: df.log_returns.rolling(window=win)
           .apply(realized_volatility_daily)
           .values
    for win in intervals
}

# Assemble into one DataFrame aligned with df's index.
vols_df = pd.DataFrame(volatility_by_window, columns=intervals, index=df.index)
In [1935]:
# Switch matplotlib to the fivethirtyeight style for the remaining plots.
plt.style.use(['fivethirtyeight'])

fig, ax = plt.subplots(figsize=(18,7))

# Draw every interval; the noisy 7-day series is rendered fainter and
# thinner so the longer-window curves stand out.
for i in intervals:
    is_short_window = (i == 7)
    ax.plot(vols_df[i],
            label=f'{i}-Day Interval Realized Volatility',
            alpha=0.5 if is_short_window else 1.0,
            lw=1 if is_short_window else 2)

ax.set_title('Realized Volatility Using Different Interval Windows', fontsize=21)

plt.legend(loc='best', prop={'size': 14})
plt.show();
In [1936]:
# Rolling-window settings: 30-day realized volatility, looking 7 days ahead.
INTERVAL_WINDOW = 30
n_future = 7

# Backward-looking realized volatility over the trailing 30 days (feature).
df['vol_current'] = df.log_returns.rolling(window=INTERVAL_WINDOW)\
                                   .apply(realized_volatility_daily)

# Forward-looking realized volatility (prediction target): shifting the log
# returns back by n_future days before rolling makes each row's 30-day
# window extend n_future days into the future — it intentionally uses
# future information and must never be used as a model input.
df['vol_future'] = df.log_returns.shift(-n_future)\
                                 .rolling(window=INTERVAL_WINDOW)\
                                 .apply(realized_volatility_daily)
In [1937]:
# Summary statistics for all numeric columns (note vol_current/vol_future
# have lower counts — their rolling warm-up NaNs are imputed below).
df.describe()
Out[1937]:
Open High Low Close Adj Close Volume Return Beta Variance AvgTrueRange Upperband Lowerband Middleband APO NATR TRANGE DMI MACD MACDSIGNAL MACDHIST MOM PPO ROCP RSI TRIX ULTOSC SLOWK SLOWD AD ADOSC OBV Upward_momentum_created Downward_momentum_created B5_O_Um B5_C_Um B5_E_Um B5_A_Um B5_N_Um B5_O_Dm B5_C_Dm B5_E_Dm B5_A_Dm B5_N_Dm Verified_status_True Verified_status_False O C E A N Real_or_Fake_tweet returns log_returns vol_current vol_future
count 277.000000 277.000000 277.000000 277.000000 277.000000 2.770000e+02 277.000000 277.000000 277.000000 277.000000 277.000000 277.000000 277.000000 277.000000 277.000000 277.000000 277.000000 277.000000 277.000000 277.000000 277.000000 277.000000 277.000000 277.000000 277.000000 277.000000 277.000000 277.000000 2.770000e+02 2.770000e+02 2.770000e+02 277.0 2.770000e+02 277.0 277.0 277.0 277.0 277.0 2.770000e+02 2.770000e+02 277.0 277.0 2.770000e+02 277.000000 277.00000 277.000000 277.000000 277.0 277.0 277.000000 277.000000 277.000000 277.000000 248.000000 241.000000
mean 144.799061 146.570433 142.963899 144.736823 143.187647 9.144496e+06 0.097236 0.759062 11.314851 3.959124 149.775163 138.928880 144.352022 1.301098 2.706196 3.984080 31.577187 1.136159 1.106099 0.030060 1.632491 0.853211 0.012893 52.528268 0.094410 50.349848 52.569869 52.738678 3.518285e+07 -3.140882e+04 1.091495e+08 0.0 1.308860e+06 0.0 0.0 0.0 0.0 0.0 1.308860e+06 1.308860e+06 0.0 0.0 1.308860e+06 2.859206 125.67870 128.537906 128.537906 0.0 0.0 128.537906 128.537906 0.097236 0.000740 0.020549 0.020536
std 14.536167 15.210822 14.163426 14.696316 14.774596 5.146221e+06 2.162605 0.374180 22.717428 1.294281 15.663937 12.988789 13.824884 5.539392 0.718777 2.565914 22.318213 3.928101 3.691723 1.109468 10.179970 3.749473 0.072433 12.837778 0.227606 9.752911 24.694526 22.645120 2.601053e+07 5.704444e+06 4.579655e+07 0.0 1.825339e+06 0.0 0.0 0.0 0.0 0.0 1.825339e+06 1.825339e+06 0.0 0.0 1.825339e+06 8.030176 127.59115 133.661627 133.661627 0.0 0.0 133.661627 133.661627 2.162605 0.021536 0.006819 0.006917
min 125.199997 125.470001 122.169998 122.949997 121.546822 2.120200e+06 -8.829329 -0.279266 0.253370 2.165666 129.483183 119.370430 126.129999 -8.476987 1.601469 1.000000 0.028564 -6.350793 -5.554278 -2.179980 -20.029999 -5.951638 -0.121556 20.458233 -0.232055 27.096608 4.889775 9.606017 -2.550926e+07 -1.312143e+07 3.175800e+06 0.0 1.470260e+05 0.0 0.0 0.0 0.0 0.0 1.470260e+05 1.470260e+05 0.0 0.0 1.470260e+05 0.000000 23.00000 23.000000 23.000000 0.0 0.0 23.000000 23.000000 -8.829329 -0.092437 0.009435 0.009435
25% 133.750000 135.160004 132.600006 133.880005 132.698029 6.295700e+06 -0.974891 0.559748 2.087043 2.838783 137.526041 130.278026 133.901428 -1.610513 2.079515 2.439995 12.544746 -1.303317 -1.287268 -0.715652 -4.570007 -1.182341 -0.032797 45.349521 -0.099900 43.345568 32.799167 34.391946 1.453209e+07 -3.820875e+06 8.076820e+07 0.0 4.865645e+05 0.0 0.0 0.0 0.0 0.0 4.865645e+05 4.865645e+05 0.0 0.0 4.865645e+05 0.000000 66.00000 67.000000 67.000000 0.0 0.0 67.000000 67.000000 -0.974891 -0.009797 0.013802 0.013766
50% 142.250000 143.020004 140.419998 142.089996 140.620056 7.800300e+06 -0.046668 0.757771 4.911776 3.825050 145.442966 137.698152 141.285716 1.170640 2.601687 3.270004 30.047637 0.522286 0.443203 -0.022545 1.239990 0.839336 0.008491 52.211804 0.057854 50.864910 54.054378 52.697144 3.356608e+07 1.811826e+04 1.060597e+08 0.0 7.637976e+05 0.0 0.0 0.0 0.0 0.0 7.637976e+05 7.637976e+05 0.0 0.0 7.637976e+05 1.000000 88.00000 90.000000 90.000000 0.0 0.0 90.000000 90.000000 -0.046668 -0.000467 0.020965 0.020944
75% 149.199997 151.399994 147.179993 148.789993 146.888046 1.063790e+07 1.219059 0.968026 10.885393 4.737909 153.667396 144.223401 148.418573 3.231217 3.337957 4.669998 45.985557 2.583333 2.545606 0.571263 5.029999 2.320083 0.039095 60.722820 0.313869 57.179383 74.469077 71.660822 5.668715e+07 3.537975e+06 1.439380e+08 0.0 1.373046e+06 0.0 0.0 0.0 0.0 0.0 1.373046e+06 1.373046e+06 0.0 0.0 1.373046e+06 2.000000 131.00000 131.000000 131.000000 0.0 0.0 131.000000 131.000000 1.219059 0.012117 0.026430 0.026476
max 190.300003 192.679993 183.110001 189.279999 189.279999 4.832150e+07 12.731084 2.051422 186.450603 7.988605 194.719772 179.250717 183.867144 20.411796 4.709399 21.100006 95.673632 13.051018 11.324009 4.256930 46.580002 13.494785 0.344450 87.703786 0.538545 76.056898 96.355217 94.746098 8.602813e+07 1.636495e+07 2.521914e+08 0.0 1.473368e+07 0.0 0.0 0.0 0.0 0.0 1.473368e+07 1.473368e+07 0.0 0.0 1.473368e+07 96.000000 1211.00000 1249.000000 1249.000000 0.0 0.0 1249.000000 1249.000000 12.731084 0.119835 0.036029 0.036029
In [1938]:
# Clearer column name for the label. Reassignment instead of inplace=True:
# same behavior, but the statement stays chainable and avoids the
# hidden-state pitfalls of in-place mutation on re-runs.
df = df.rename(columns={'Real_or_Fake_tweet': 'Fake_news'})
In [1939]:
# Impute remaining NaNs (the rolling warm-up rows of vol_current /
# vol_future added after the earlier dropna) with each column's median.
# NOTE(review): median-imputing the forward-looking target leaks
# distributional information into those rows — consider dropping them
# instead. Also, median() over a frame containing a datetime column may
# warn/error on newer pandas versions; confirm against the pinned version.
df = df.fillna(df.median())
In [1940]:
# Confirm no missing values remain after imputation.
df.isna().sum()
Out[1940]:
Date                         0
Open                         0
High                         0
Low                          0
Close                        0
Adj Close                    0
Volume                       0
Return                       0
Beta                         0
Variance                     0
AvgTrueRange                 0
Upperband                    0
Lowerband                    0
Middleband                   0
APO                          0
NATR                         0
TRANGE                       0
DMI                          0
MACD                         0
MACDSIGNAL                   0
MACDHIST                     0
MOM                          0
PPO                          0
ROCP                         0
RSI                          0
TRIX                         0
ULTOSC                       0
SLOWK                        0
SLOWD                        0
AD                           0
ADOSC                        0
OBV                          0
Upward_momentum_created      0
Downward_momentum_created    0
B5_O_Um                      0
B5_C_Um                      0
B5_E_Um                      0
B5_A_Um                      0
B5_N_Um                      0
B5_O_Dm                      0
B5_C_Dm                      0
B5_E_Dm                      0
B5_A_Dm                      0
B5_N_Dm                      0
Verified_status_True         0
Verified_status_False        0
O                            0
C                            0
E                            0
A                            0
N                            0
Fake_news                    0
returns                      0
log_returns                  0
vol_current                  0
vol_future                   0
dtype: int64
In [1941]:
# Re-check dtypes and non-null counts after feature engineering and
# NaN handling (now 277 rows x 56 columns, all non-null).
df.info()
<class 'pandas.core.frame.DataFrame'>
Int64Index: 277 entries, 66 to 342
Data columns (total 56 columns):
 #   Column                     Non-Null Count  Dtype         
---  ------                     --------------  -----         
 0   Date                       277 non-null    datetime64[ns]
 1   Open                       277 non-null    float64       
 2   High                       277 non-null    float64       
 3   Low                        277 non-null    float64       
 4   Close                      277 non-null    float64       
 5   Adj Close                  277 non-null    float64       
 6   Volume                     277 non-null    int64         
 7   Return                     277 non-null    float64       
 8   Beta                       277 non-null    float64       
 9   Variance                   277 non-null    float64       
 10  AvgTrueRange               277 non-null    float64       
 11  Upperband                  277 non-null    float64       
 12  Lowerband                  277 non-null    float64       
 13  Middleband                 277 non-null    float64       
 14  APO                        277 non-null    float64       
 15  NATR                       277 non-null    float64       
 16  TRANGE                     277 non-null    float64       
 17  DMI                        277 non-null    float64       
 18  MACD                       277 non-null    float64       
 19  MACDSIGNAL                 277 non-null    float64       
 20  MACDHIST                   277 non-null    float64       
 21  MOM                        277 non-null    float64       
 22  PPO                        277 non-null    float64       
 23  ROCP                       277 non-null    float64       
 24  RSI                        277 non-null    float64       
 25  TRIX                       277 non-null    float64       
 26  ULTOSC                     277 non-null    float64       
 27  SLOWK                      277 non-null    float64       
 28  SLOWD                      277 non-null    float64       
 29  AD                         277 non-null    float64       
 30  ADOSC                      277 non-null    float64       
 31  OBV                        277 non-null    float64       
 32  Upward_momentum_created    277 non-null    float64       
 33  Downward_momentum_created  277 non-null    float64       
 34  B5_O_Um                    277 non-null    float64       
 35  B5_C_Um                    277 non-null    float64       
 36  B5_E_Um                    277 non-null    float64       
 37  B5_A_Um                    277 non-null    float64       
 38  B5_N_Um                    277 non-null    float64       
 39  B5_O_Dm                    277 non-null    float64       
 40  B5_C_Dm                    277 non-null    float64       
 41  B5_E_Dm                    277 non-null    float64       
 42  B5_A_Dm                    277 non-null    float64       
 43  B5_N_Dm                    277 non-null    float64       
 44  Verified_status_True       277 non-null    int64         
 45  Verified_status_False      277 non-null    int64         
 46  O                          277 non-null    int64         
 47  C                          277 non-null    int64         
 48  E                          277 non-null    int64         
 49  A                          277 non-null    int64         
 50  N                          277 non-null    int64         
 51  Fake_news                  277 non-null    int64         
 52  returns                    277 non-null    float64       
 53  log_returns                277 non-null    float64       
 54  vol_current                277 non-null    float64       
 55  vol_future                 277 non-null    float64       
dtypes: datetime64[ns](1), float64(46), int64(9)
memory usage: 123.4 KB
In [1942]:
# (rows, columns) after cleaning.
df.shape
Out[1942]:
(277, 56)
In [1943]:
# Defensive NaN drop. NOTE(review): the fillna cell above already removed
# every NaN (see the isna().sum() output), so this is a no-op here.
df=df.dropna()
In [1944]:
# Final dtype check before the correlation analysis.
df.dtypes
Out[1944]:
Date                         datetime64[ns]
Open                                float64
High                                float64
Low                                 float64
Close                               float64
Adj Close                           float64
Volume                                int64
Return                              float64
Beta                                float64
Variance                            float64
AvgTrueRange                        float64
Upperband                           float64
Lowerband                           float64
Middleband                          float64
APO                                 float64
NATR                                float64
TRANGE                              float64
DMI                                 float64
MACD                                float64
MACDSIGNAL                          float64
MACDHIST                            float64
MOM                                 float64
PPO                                 float64
ROCP                                float64
RSI                                 float64
TRIX                                float64
ULTOSC                              float64
SLOWK                               float64
SLOWD                               float64
AD                                  float64
ADOSC                               float64
OBV                                 float64
Upward_momentum_created             float64
Downward_momentum_created           float64
B5_O_Um                             float64
B5_C_Um                             float64
B5_E_Um                             float64
B5_A_Um                             float64
B5_N_Um                             float64
B5_O_Dm                             float64
B5_C_Dm                             float64
B5_E_Dm                             float64
B5_A_Dm                             float64
B5_N_Dm                             float64
Verified_status_True                  int64
Verified_status_False                 int64
O                                     int64
C                                     int64
E                                     int64
A                                     int64
N                                     int64
Fake_news                             int64
returns                             float64
log_returns                         float64
vol_current                         float64
vol_future                          float64
dtype: object
In [1945]:
# Correlation heatmap across all numeric features.
# The duplicate matplotlib/seaborn imports were removed — both are already
# imported in the setup cell at the top of the notebook.
plt.figure(figsize=(40,15))
sns.heatmap(df.corr(),annot=True);
Out[1945]:
<matplotlib.axes._subplots.AxesSubplot at 0x7f0778a1cb90>
In [1946]:
# Per-column histograms of the daily frame; the trailing ';' suppresses the axes repr.
df.hist(figsize=(20, 32), bins=70, xlabelsize=8, ylabelsize=8);
In [1947]:
# Features whose correlation with 'AvgTrueRange' exceeds 0.5 in magnitude, strongest first.
df_corr = df.corr()['AvgTrueRange']
strong = df_corr[df_corr.abs() > 0.5]
golden_features_list = strong.sort_values(ascending=False)
print("There are {} strongly correlated values with AvgTrueRange:\n{}".format(len(golden_features_list), golden_features_list))
There are 18 strongly correlated values with AvgTrueRange:
AvgTrueRange    1.000000
NATR            0.936906
vol_current     0.830851
Upperband       0.788924
vol_future      0.760790
High            0.716356
Middleband      0.710541
Open            0.696627
Close           0.690765
Adj Close       0.676613
Low             0.671794
TRANGE          0.626008
OBV             0.613046
Lowerband       0.561148
MACDSIGNAL      0.559198
MACD            0.528492
Volume          0.511560
TRIX            0.506625
Name: AvgTrueRange, dtype: float64
In [1948]:
# Features whose correlation with 'NATR' exceeds 0.5 in magnitude, strongest first.
df_corr = df.corr()['NATR']
strong = df_corr[df_corr.abs() > 0.5]
golden_features_list = strong.sort_values(ascending=False)
print("There are {} strongly correlated values with NATR :\n{}".format(len(golden_features_list), golden_features_list))
There are 7 strongly correlated values with NATR :
NATR            1.000000
AvgTrueRange    0.936906
vol_current     0.813504
vol_future      0.758667
TRANGE          0.586369
Volume          0.547024
Upperband       0.546809
Name: NATR, dtype: float64
In [1949]:
# Features whose correlation with 'TRANGE' exceeds 0.5 in magnitude, strongest first.
df_corr = df.corr()['TRANGE']
strong = df_corr[df_corr.abs() > 0.5]
golden_features_list = strong.sort_values(ascending=False)
print("There are {} strongly correlated values with TRANGE:\n{}".format(len(golden_features_list), golden_features_list))
There are 14 strongly correlated values with TRANGE:
TRANGE                       1.000000
Volume                       0.805468
Verified_status_False        0.724640
Fake_news                    0.722074
N                            0.722074
C                            0.722074
O                            0.722074
B5_N_Dm                      0.628767
B5_C_Dm                      0.628767
B5_O_Dm                      0.628767
Downward_momentum_created    0.628767
AvgTrueRange                 0.626008
NATR                         0.586369
Verified_status_True         0.505084
Name: TRANGE, dtype: float64
In [1950]:
# Features whose correlation with 'O' (openness) exceeds 0.5 in magnitude, strongest first.
df_corr = df.corr()['O']
strong = df_corr[df_corr.abs() > 0.5]
golden_features_list = strong.sort_values(ascending=False)
print("There are {} strongly correlated values with Openness:\n{}".format(len(golden_features_list), golden_features_list))
There are 12 strongly correlated values with Openness:
Fake_news                    1.000000
N                            1.000000
C                            1.000000
O                            1.000000
Verified_status_False        0.999190
B5_N_Dm                      0.912847
B5_C_Dm                      0.912847
B5_O_Dm                      0.912847
Downward_momentum_created    0.912847
Volume                       0.797064
Verified_status_True         0.768831
TRANGE                       0.722074
Name: O, dtype: float64
In [1951]:
# Features whose correlation with 'C' (conscientiousness) exceeds 0.5 in magnitude, strongest first.
df_corr = df.corr()['C']
strong = df_corr[df_corr.abs() > 0.5]
golden_features_list = strong.sort_values(ascending=False)
print("There are {} strongly correlated values with conscientiousness:\n{}".format(len(golden_features_list), golden_features_list))
There are 12 strongly correlated values with conscientiousness:
Fake_news                    1.000000
N                            1.000000
C                            1.000000
O                            1.000000
Verified_status_False        0.999190
B5_N_Dm                      0.912847
B5_C_Dm                      0.912847
B5_O_Dm                      0.912847
Downward_momentum_created    0.912847
Volume                       0.797064
Verified_status_True         0.768831
TRANGE                       0.722074
Name: C, dtype: float64
In [1952]:
# Correlation screen for 'E'. Bug fix: the printed label said
# "conscientiousness" (copy-pasted from the 'C' cell); 'E' is extraversion.
df_corr = df.corr()['E']
golden_features_list = df_corr[abs(df_corr) > 0.5].sort_values(ascending=False)
print("There are {} strongly correlated values with extraversion:\n{}".format(len(golden_features_list), golden_features_list))
There are 0 strongly correlated values with extraversion (E):
Series([], Name: E, dtype: float64)
In [1953]:
# Correlation screen for 'A'. Bug fix: the printed label said
# "conscientiousness" (copy-pasted from the 'C' cell); 'A' is agreeableness.
df_corr = df.corr()['A']
golden_features_list = df_corr[abs(df_corr) > 0.5].sort_values(ascending=False)
print("There are {} strongly correlated values with agreeableness:\n{}".format(len(golden_features_list), golden_features_list))
There are 0 strongly correlated values with agreeableness (A):
Series([], Name: A, dtype: float64)
In [1954]:
# Correlation screen for 'N'. Bug fix: the printed label said
# "conscientiousness" (copy-pasted from the 'C' cell); 'N' is neuroticism.
df_corr = df.corr()['N']
golden_features_list = df_corr[abs(df_corr) > 0.5].sort_values(ascending=False)
print("There are {} strongly correlated values with neuroticism:\n{}".format(len(golden_features_list), golden_features_list))
There are 12 strongly correlated values with neuroticism (N):
Fake_news                    1.000000
N                            1.000000
C                            1.000000
O                            1.000000
Verified_status_False        0.999190
B5_N_Dm                      0.912847
B5_C_Dm                      0.912847
B5_O_Dm                      0.912847
Downward_momentum_created    0.912847
Volume                       0.797064
Verified_status_True         0.768831
TRANGE                       0.722074
Name: N, dtype: float64
In [1955]:
# Inspect all column names of the daily frame.
df.columns
Out[1955]:
Index(['Date', 'Open', 'High', 'Low', 'Close', 'Adj Close', 'Volume', 'Return',
       'Beta', 'Variance', 'AvgTrueRange', 'Upperband', 'Lowerband',
       'Middleband', 'APO', 'NATR', 'TRANGE', 'DMI', 'MACD', 'MACDSIGNAL',
       'MACDHIST', 'MOM', 'PPO', 'ROCP', 'RSI', 'TRIX', 'ULTOSC', 'SLOWK',
       'SLOWD', 'AD', 'ADOSC', 'OBV', 'Upward_momentum_created',
       'Downward_momentum_created', 'B5_O_Um', 'B5_C_Um', 'B5_E_Um', 'B5_A_Um',
       'B5_N_Um', 'B5_O_Dm', 'B5_C_Dm', 'B5_E_Dm', 'B5_A_Dm', 'B5_N_Dm',
       'Verified_status_True', 'Verified_status_False', 'O', 'C', 'E', 'A',
       'N', 'Fake_news', 'returns', 'log_returns', 'vol_current',
       'vol_future'],
      dtype='object')
In [1956]:
# Features whose correlation with 'B5_O_Um' exceeds 0.5 in magnitude, strongest first.
df_corr = df.corr()['B5_O_Um']
strong = df_corr[df_corr.abs() > 0.5]
golden_features_list = strong.sort_values(ascending=False)
print("There are {} strongly correlated values with B5_O_Um:\n{}".format(len(golden_features_list), golden_features_list))
There are 0 strongly correlated values with B5_O_Um:
Series([], Name: B5_O_Um, dtype: float64)
In [1957]:
# Features whose correlation with 'B5_C_Um' exceeds 0.5 in magnitude, strongest first.
df_corr = df.corr()['B5_C_Um']
strong = df_corr[df_corr.abs() > 0.5]
golden_features_list = strong.sort_values(ascending=False)
print("There are {} strongly correlated values with B5_C_Um:\n{}".format(len(golden_features_list), golden_features_list))
There are 0 strongly correlated values with B5_C_Um:
Series([], Name: B5_C_Um, dtype: float64)
In [1958]:
# Features whose correlation with 'B5_E_Um' exceeds 0.5 in magnitude, strongest first.
df_corr = df.corr()['B5_E_Um']
strong = df_corr[df_corr.abs() > 0.5]
golden_features_list = strong.sort_values(ascending=False)
print("There are {} strongly correlated values with B5_E_Um:\n{}".format(len(golden_features_list), golden_features_list))
There are 0 strongly correlated values with B5_E_Um:
Series([], Name: B5_E_Um, dtype: float64)
In [1959]:
# Features whose correlation with 'B5_A_Um' exceeds 0.5 in magnitude, strongest first.
df_corr = df.corr()['B5_A_Um']
strong = df_corr[df_corr.abs() > 0.5]
golden_features_list = strong.sort_values(ascending=False)
print("There are {} strongly correlated values with B5_A_Um:\n{}".format(len(golden_features_list), golden_features_list))
There are 0 strongly correlated values with B5_A_Um:
Series([], Name: B5_A_Um, dtype: float64)
In [1960]:
# Features whose correlation with 'B5_N_Um' exceeds 0.5 in magnitude, strongest first.
df_corr = df.corr()['B5_N_Um']
strong = df_corr[df_corr.abs() > 0.5]
golden_features_list = strong.sort_values(ascending=False)
print("There are {} strongly correlated values with B5_N_Um:\n{}".format(len(golden_features_list), golden_features_list))
There are 0 strongly correlated values with B5_N_Um:
Series([], Name: B5_N_Um, dtype: float64)

Correlation screens for the downward-momentum (B5_*_Dm) features

In [1961]:
# Features whose correlation with 'B5_O_Dm' exceeds 0.5 in magnitude, strongest first.
df_corr = df.corr()['B5_O_Dm']
strong = df_corr[df_corr.abs() > 0.5]
golden_features_list = strong.sort_values(ascending=False)
print("There are {} strongly correlated values with B5_O_Dm:\n{}".format(len(golden_features_list), golden_features_list))
There are 12 strongly correlated values with B5_O_Dm:
B5_N_Dm                      1.000000
B5_C_Dm                      1.000000
B5_O_Dm                      1.000000
Downward_momentum_created    1.000000
Fake_news                    0.912847
N                            0.912847
C                            0.912847
O                            0.912847
Verified_status_False        0.907042
Verified_status_True         0.782305
Volume                       0.734404
TRANGE                       0.628767
Name: B5_O_Dm, dtype: float64
In [1962]:
# Features whose correlation with 'B5_C_Dm' exceeds 0.5 in magnitude, strongest first.
df_corr = df.corr()['B5_C_Dm']
strong = df_corr[df_corr.abs() > 0.5]
golden_features_list = strong.sort_values(ascending=False)
print("There are {} strongly correlated values with B5_C_Dm:\n{}".format(len(golden_features_list), golden_features_list))
There are 12 strongly correlated values with B5_C_Dm:
B5_N_Dm                      1.000000
B5_C_Dm                      1.000000
B5_O_Dm                      1.000000
Downward_momentum_created    1.000000
Fake_news                    0.912847
N                            0.912847
C                            0.912847
O                            0.912847
Verified_status_False        0.907042
Verified_status_True         0.782305
Volume                       0.734404
TRANGE                       0.628767
Name: B5_C_Dm, dtype: float64
In [1963]:
# Features whose correlation with 'B5_E_Dm' exceeds 0.5 in magnitude, strongest first.
df_corr = df.corr()['B5_E_Dm']
strong = df_corr[df_corr.abs() > 0.5]
golden_features_list = strong.sort_values(ascending=False)
print("There are {} strongly correlated values with B5_E_Dm:\n{}".format(len(golden_features_list), golden_features_list))
There are 0 strongly correlated values with B5_E_Dm:
Series([], Name: B5_E_Dm, dtype: float64)
In [1964]:
# Features whose correlation with 'B5_A_Dm' exceeds 0.5 in magnitude, strongest first.
df_corr = df.corr()['B5_A_Dm']
strong = df_corr[df_corr.abs() > 0.5]
golden_features_list = strong.sort_values(ascending=False)
print("There are {} strongly correlated values with B5_A_Dm:\n{}".format(len(golden_features_list), golden_features_list))
There are 0 strongly correlated values with B5_A_Dm:
Series([], Name: B5_A_Dm, dtype: float64)
In [1965]:
# Features whose correlation with 'B5_N_Dm' exceeds 0.5 in magnitude, strongest first.
df_corr = df.corr()['B5_N_Dm']
strong = df_corr[df_corr.abs() > 0.5]
golden_features_list = strong.sort_values(ascending=False)
print("There are {} strongly correlated values with B5_N_Dm:\n{}".format(len(golden_features_list), golden_features_list))
There are 12 strongly correlated values with B5_N_Dm:
B5_N_Dm                      1.000000
B5_C_Dm                      1.000000
B5_O_Dm                      1.000000
Downward_momentum_created    1.000000
Fake_news                    0.912847
N                            0.912847
C                            0.912847
O                            0.912847
Verified_status_False        0.907042
Verified_status_True         0.782305
Volume                       0.734404
TRANGE                       0.628767
Name: B5_N_Dm, dtype: float64
In [1966]:
# Features whose correlation with 'Fake_news' exceeds 0.5 in magnitude, strongest first.
df_corr = df.corr()['Fake_news']
strong = df_corr[df_corr.abs() > 0.5]
golden_features_list = strong.sort_values(ascending=False)
print("There are {} strongly correlated values with Real_or_Fake_tweet :\n{}".format(len(golden_features_list), golden_features_list))
There are 12 strongly correlated values with Real_or_Fake_tweet :
Fake_news                    1.000000
N                            1.000000
C                            1.000000
O                            1.000000
Verified_status_False        0.999190
B5_N_Dm                      0.912847
B5_C_Dm                      0.912847
B5_O_Dm                      0.912847
Downward_momentum_created    0.912847
Volume                       0.797064
Verified_status_True         0.768831
TRANGE                       0.722074
Name: Fake_news, dtype: float64
In [1967]:
# Features whose correlation with 'Downward_momentum_created' exceeds 0.5 in magnitude, strongest first.
df_corr = df.corr()['Downward_momentum_created']
strong = df_corr[df_corr.abs() > 0.5]
golden_features_list = strong.sort_values(ascending=False)
print("There are {} strongly correlated values with Downward_momentum_created :\n{}".format(len(golden_features_list), golden_features_list))
There are 12 strongly correlated values with Downward_momentum_created :
B5_N_Dm                      1.000000
B5_C_Dm                      1.000000
B5_O_Dm                      1.000000
Downward_momentum_created    1.000000
Fake_news                    0.912847
N                            0.912847
C                            0.912847
O                            0.912847
Verified_status_False        0.907042
Verified_status_True         0.782305
Volume                       0.734404
TRANGE                       0.628767
Name: Downward_momentum_created, dtype: float64
In [1968]:
# Features whose correlation with 'Upward_momentum_created' exceeds 0.5 in magnitude, strongest first.
df_corr = df.corr()['Upward_momentum_created']
strong = df_corr[df_corr.abs() > 0.5]
golden_features_list = strong.sort_values(ascending=False)
print("There are {} strongly correlated values with Upward_momentum_created :\n{}".format(len(golden_features_list), golden_features_list))
There are 0 strongly correlated values with Upward_momentum_created :
Series([], Name: Upward_momentum_created, dtype: float64)
In [1969]:
# Features whose correlation with 'Verified_status_True' exceeds 0.5 in magnitude, strongest first.
df_corr = df.corr()['Verified_status_True']
strong = df_corr[df_corr.abs() > 0.5]
golden_features_list = strong.sort_values(ascending=False)
print("There are {} strongly correlated values with Verified_status_True :\n{}".format(len(golden_features_list), golden_features_list))
There are 12 strongly correlated values with Verified_status_True :
Verified_status_True         1.000000
B5_N_Dm                      0.782305
B5_C_Dm                      0.782305
B5_O_Dm                      0.782305
Downward_momentum_created    0.782305
Fake_news                    0.768831
N                            0.768831
C                            0.768831
O                            0.768831
Verified_status_False        0.742473
Volume                       0.549319
TRANGE                       0.505084
Name: Verified_status_True, dtype: float64
In [1970]:
# Features whose correlation with 'Verified_status_False' exceeds 0.5 in magnitude, strongest first.
df_corr = df.corr()['Verified_status_False']
strong = df_corr[df_corr.abs() > 0.5]
golden_features_list = strong.sort_values(ascending=False)
print("There are {} strongly correlated values with Verified_status_False :\n{}".format(len(golden_features_list), golden_features_list))
There are 12 strongly correlated values with Verified_status_False :
Verified_status_False        1.000000
Fake_news                    0.999190
N                            0.999190
C                            0.999190
O                            0.999190
B5_N_Dm                      0.907042
B5_C_Dm                      0.907042
B5_O_Dm                      0.907042
Downward_momentum_created    0.907042
Volume                       0.800414
Verified_status_True         0.742473
TRANGE                       0.724640
Name: Verified_status_False, dtype: float64
In [1971]:
# Shrink seaborn fonts before the upcoming pairplots.
sns.set(font_scale=0.8)
In [1972]:
# Plot every feature against 'NATR', five x-variables per pairplot figure.
for start in range(0, len(df.columns), 5):
    sns.pairplot(data=df,
                 y_vars=['NATR'],
                 x_vars=df.columns[start:start + 5])
In [1973]:
# Inspect each column's dtype in the daily frame.
df.dtypes
Out[1973]:
Date                         datetime64[ns]
Open                                float64
High                                float64
Low                                 float64
Close                               float64
Adj Close                           float64
Volume                                int64
Return                              float64
Beta                                float64
Variance                            float64
AvgTrueRange                        float64
Upperband                           float64
Lowerband                           float64
Middleband                          float64
APO                                 float64
NATR                                float64
TRANGE                              float64
DMI                                 float64
MACD                                float64
MACDSIGNAL                          float64
MACDHIST                            float64
MOM                                 float64
PPO                                 float64
ROCP                                float64
RSI                                 float64
TRIX                                float64
ULTOSC                              float64
SLOWK                               float64
SLOWD                               float64
AD                                  float64
ADOSC                               float64
OBV                                 float64
Upward_momentum_created             float64
Downward_momentum_created           float64
B5_O_Um                             float64
B5_C_Um                             float64
B5_E_Um                             float64
B5_A_Um                             float64
B5_N_Um                             float64
B5_O_Dm                             float64
B5_C_Dm                             float64
B5_E_Dm                             float64
B5_A_Dm                             float64
B5_N_Dm                             float64
Verified_status_True                  int64
Verified_status_False                 int64
O                                     int64
C                                     int64
E                                     int64
A                                     int64
N                                     int64
Fake_news                             int64
returns                             float64
log_returns                         float64
vol_current                         float64
vol_future                          float64
dtype: object
In [1974]:
# Count missing values per column (the rendered output below shows all zeros).
df.isnull().sum()
Out[1974]:
Date                         0
Open                         0
High                         0
Low                          0
Close                        0
Adj Close                    0
Volume                       0
Return                       0
Beta                         0
Variance                     0
AvgTrueRange                 0
Upperband                    0
Lowerband                    0
Middleband                   0
APO                          0
NATR                         0
TRANGE                       0
DMI                          0
MACD                         0
MACDSIGNAL                   0
MACDHIST                     0
MOM                          0
PPO                          0
ROCP                         0
RSI                          0
TRIX                         0
ULTOSC                       0
SLOWK                        0
SLOWD                        0
AD                           0
ADOSC                        0
OBV                          0
Upward_momentum_created      0
Downward_momentum_created    0
B5_O_Um                      0
B5_C_Um                      0
B5_E_Um                      0
B5_A_Um                      0
B5_N_Um                      0
B5_O_Dm                      0
B5_C_Dm                      0
B5_E_Dm                      0
B5_A_Dm                      0
B5_N_Dm                      0
Verified_status_True         0
Verified_status_False        0
O                            0
C                            0
E                            0
A                            0
N                            0
Fake_news                    0
returns                      0
log_returns                  0
vol_current                  0
vol_future                   0
dtype: int64
In [1975]:
# Replace any remaining NaNs with 0. Rebinding instead of `inplace=True`
# keeps the cell idempotent and chain-friendly; the isnull().sum() check
# above already reports zero NaNs, so this is a defensive no-op.
df = df.fillna(0)
In [1976]:
# Drop rows containing NaNs. After the fillna(0) cell above there can be
# none left, so this is redundant; kept as a safety net, rewritten to
# avoid the `inplace=True` anti-pattern.
df = df.dropna()
In [1977]:
# Re-apply the smaller seaborn font scale for the next heatmap.
sns.set(font_scale=0.8)
In [1978]:
# Heatmap restricted to notable correlations; 'Close' is excluded to reduce
# redundancy with the other price columns.
# NOTE(review): the cutoffs are asymmetric (>= 0.5 positive vs. <= -0.4
# negative) — confirm this was intended.
corr = df.drop('Close', axis=1).corr()
plt.figure(figsize=(12, 10))

kept = (corr >= 0.5) | (corr <= -0.4)
sns.heatmap(corr[kept],
            cmap='YlGnBu', vmax=1.0, vmin=-1.0, linewidths=0.1,
            annot=True, annot_kws={"size": 8}, square=True);
In [1979]:
# Summary statistics (count/mean/std/min/quartiles/max) for every numeric column.
df.describe()
Out[1979]:
Open High Low Close Adj Close Volume Return Beta Variance AvgTrueRange Upperband Lowerband Middleband APO NATR TRANGE DMI MACD MACDSIGNAL MACDHIST MOM PPO ROCP RSI TRIX ULTOSC SLOWK SLOWD AD ADOSC OBV Upward_momentum_created Downward_momentum_created B5_O_Um B5_C_Um B5_E_Um B5_A_Um B5_N_Um B5_O_Dm B5_C_Dm B5_E_Dm B5_A_Dm B5_N_Dm Verified_status_True Verified_status_False O C E A N Fake_news returns log_returns vol_current vol_future
count 277.000000 277.000000 277.000000 277.000000 277.000000 2.770000e+02 277.000000 277.000000 277.000000 277.000000 277.000000 277.000000 277.000000 277.000000 277.000000 277.000000 277.000000 277.000000 277.000000 277.000000 277.000000 277.000000 277.000000 277.000000 277.000000 277.000000 277.000000 277.000000 2.770000e+02 2.770000e+02 2.770000e+02 277.0 2.770000e+02 277.0 277.0 277.0 277.0 277.0 2.770000e+02 2.770000e+02 277.0 277.0 2.770000e+02 277.000000 277.00000 277.000000 277.000000 277.0 277.0 277.000000 277.000000 277.000000 277.000000 277.000000 277.000000
mean 144.799061 146.570433 142.963899 144.736823 143.187647 9.144496e+06 0.097236 0.759062 11.314851 3.959124 149.775163 138.928880 144.352022 1.301098 2.706196 3.984080 31.577187 1.136159 1.106099 0.030060 1.632491 0.853211 0.012893 52.528268 0.094410 50.349848 52.569869 52.738678 3.518285e+07 -3.140882e+04 1.091495e+08 0.0 1.308860e+06 0.0 0.0 0.0 0.0 0.0 1.308860e+06 1.308860e+06 0.0 0.0 1.308860e+06 2.859206 125.67870 128.537906 128.537906 0.0 0.0 128.537906 128.537906 0.097236 0.000740 0.020593 0.020589
std 14.536167 15.210822 14.163426 14.696316 14.774596 5.146221e+06 2.162605 0.374180 22.717428 1.294281 15.663937 12.988789 13.824884 5.539392 0.718777 2.565914 22.318213 3.928101 3.691723 1.109468 10.179970 3.749473 0.072433 12.837778 0.227606 9.752911 24.694526 22.645120 2.601053e+07 5.704444e+06 4.579655e+07 0.0 1.825339e+06 0.0 0.0 0.0 0.0 0.0 1.825339e+06 1.825339e+06 0.0 0.0 1.825339e+06 8.030176 127.59115 133.661627 133.661627 0.0 0.0 133.661627 133.661627 2.162605 0.021536 0.006452 0.006452
min 125.199997 125.470001 122.169998 122.949997 121.546822 2.120200e+06 -8.829329 -0.279266 0.253370 2.165666 129.483183 119.370430 126.129999 -8.476987 1.601469 1.000000 0.028564 -6.350793 -5.554278 -2.179980 -20.029999 -5.951638 -0.121556 20.458233 -0.232055 27.096608 4.889775 9.606017 -2.550926e+07 -1.312143e+07 3.175800e+06 0.0 1.470260e+05 0.0 0.0 0.0 0.0 0.0 1.470260e+05 1.470260e+05 0.0 0.0 1.470260e+05 0.000000 23.00000 23.000000 23.000000 0.0 0.0 23.000000 23.000000 -8.829329 -0.092437 0.009435 0.009435
25% 133.750000 135.160004 132.600006 133.880005 132.698029 6.295700e+06 -0.974891 0.559748 2.087043 2.838783 137.526041 130.278026 133.901428 -1.610513 2.079515 2.439995 12.544746 -1.303317 -1.287268 -0.715652 -4.570007 -1.182341 -0.032797 45.349521 -0.099900 43.345568 32.799167 34.391946 1.453209e+07 -3.820875e+06 8.076820e+07 0.0 4.865645e+05 0.0 0.0 0.0 0.0 0.0 4.865645e+05 4.865645e+05 0.0 0.0 4.865645e+05 0.000000 66.00000 67.000000 67.000000 0.0 0.0 67.000000 67.000000 -0.974891 -0.009797 0.015007 0.015007
50% 142.250000 143.020004 140.419998 142.089996 140.620056 7.800300e+06 -0.046668 0.757771 4.911776 3.825050 145.442966 137.698152 141.285716 1.170640 2.601687 3.270004 30.047637 0.522286 0.443203 -0.022545 1.239990 0.839336 0.008491 52.211804 0.057854 50.864910 54.054378 52.697144 3.356608e+07 1.811826e+04 1.060597e+08 0.0 7.637976e+05 0.0 0.0 0.0 0.0 0.0 7.637976e+05 7.637976e+05 0.0 0.0 7.637976e+05 1.000000 88.00000 90.000000 90.000000 0.0 0.0 90.000000 90.000000 -0.046668 -0.000467 0.020965 0.020944
75% 149.199997 151.399994 147.179993 148.789993 146.888046 1.063790e+07 1.219059 0.968026 10.885393 4.737909 153.667396 144.223401 148.418573 3.231217 3.337957 4.669998 45.985557 2.583333 2.545606 0.571263 5.029999 2.320083 0.039095 60.722820 0.313869 57.179383 74.469077 71.660822 5.668715e+07 3.537975e+06 1.439380e+08 0.0 1.373046e+06 0.0 0.0 0.0 0.0 0.0 1.373046e+06 1.373046e+06 0.0 0.0 1.373046e+06 2.000000 131.00000 131.000000 131.000000 0.0 0.0 131.000000 131.000000 1.219059 0.012117 0.025812 0.025812
max 190.300003 192.679993 183.110001 189.279999 189.279999 4.832150e+07 12.731084 2.051422 186.450603 7.988605 194.719772 179.250717 183.867144 20.411796 4.709399 21.100006 95.673632 13.051018 11.324009 4.256930 46.580002 13.494785 0.344450 87.703786 0.538545 76.056898 96.355217 94.746098 8.602813e+07 1.636495e+07 2.521914e+08 0.0 1.473368e+07 0.0 0.0 0.0 0.0 0.0 1.473368e+07 1.473368e+07 0.0 0.0 1.473368e+07 96.000000 1211.00000 1249.000000 1249.000000 0.0 0.0 1249.000000 1249.000000 12.731084 0.119835 0.036029 0.036029
In [1980]:
# DROPPING ALL NaN VALUES — a no-op at this point, since NaNs were already
# filled/dropped in earlier cells; rewritten without `inplace=True`.
df = df.dropna()
In [1981]:
n_zoom = 365  # number of most-recent days shown in the zoomed (lower) panel
sns.set_context("talk", font_scale=1.3)
# plt.style.use(['seaborn'])

# VISUALIZE REALIZED CURRENT VS. FUTURE VOLATILITY
# NOTE(review): `n_future` and `INTERVAL_WINDOW` are defined in an earlier
# cell not shown here — presumably the forecast horizon (days) and the
# rolling-window length used to build vol_current/vol_future; confirm.
with sns.axes_style("whitegrid"):
    fig, (ax1, ax2) = plt.subplots(nrows=2, ncols=1, figsize=(18,14))

    # Top panel: full history of current vs. future volatility.
    ax1.plot(df.vol_current, alpha=.8, lw=1, color='gray', ls=':',
            label='Current Volatility')
    ax1.plot(df.vol_future, lw=1, color='blue',
            label=f'Next {n_future} Days Volatility (TARGET)')

    # Bottom panel: same series restricted to the last n_zoom rows.
    ax2.plot(df.vol_current[-n_zoom:], alpha=.8, lw=2, color='gray', ls=':',
            label='Current Volatility')
    ax2.plot(df.vol_future[-n_zoom:], lw=2, color='blue',
            label=f'Next {n_future} Days Volatility (TARGET)')

    ax1.title.set_text(f'Future vs. Current Daily Volatility \n Using {INTERVAL_WINDOW}-Day Interval')
    ax2.title.set_text(f'Zooming in the Last {n_zoom} Days')

    ax1.legend(loc='upper left', prop={'size': 13}, frameon=True)
    ax2.legend(loc='upper left', prop={'size': 13}, frameon=True)
    plt.tight_layout()
    
    plt.show();

Daily Volatility Distribution

In [1982]:
# Distribution of realized daily volatility with a fitted normal curve.
# NOTE(review): sns.distplot is deprecated in seaborn >= 0.11; kept because
# the notebook runs an older seaborn.
with sns.axes_style("darkgrid"):
    fig, ax = plt.subplots(figsize=(10,6))
    sns.distplot(df.vol_current, norm_hist=True, fit=stats.norm,
                bins=50, ax=ax)
    ax.set_title('Daily Volatility Distribution')
    plt.show();

Experiment 2: weekly granularity

In [1983]:
# Ticker selector for the weekly-granularity experiment (same widget as the
# earlier cell, re-created so a fresh stock can be picked).
STOCK_CHOICES = ['SELECT', 'AAPL', 'ABUS', 'ARDS', 'BABA', 'BFRI',
                 'FB', 'GME', 'MCD', 'PFE', 'PLUG',
                 'QCOM', 'SENS', 'TSLA', 'TWTR', 'UUUU']

w = widgets.Dropdown(options=STOCK_CHOICES,
                     value='SELECT',
                     description='Stock name:')

def on_change(change):
    """Echo the newly selected ticker whenever the dropdown value changes."""
    if change['type'] == 'change' and change['name'] == 'value':
        print("You have selected %s" % change['new'])

w.observe(on_change)
display(w)
You have selected QCOM
In [1984]:
# Load the per-stock feature file matching the dropdown selection.
# The original cell repeated one `if`/`read_csv` pair per ticker; every
# file follows the '/content/Final_<TICKER>.csv' naming scheme, so a single
# parameterized path is equivalent (and automatically covers any ticker
# added to the dropdown later). 'SELECT' leaves `df` untouched, exactly as
# before, when no branch matched.
if w.value != 'SELECT':
    df = pd.read_csv('/content/Final_{}.csv'.format(w.value),
                     parse_dates=['Date'], index_col=['Date'])
In [1985]:
# Inspect the freshly loaded frame's columns (note the 'Unnamed: 0' CSV index artifact).
df.columns
Out[1985]:
Index(['Unnamed: 0', 'Open', 'High', 'Low', 'Close', 'Adj Close', 'Volume',
       'Return', 'Beta', 'Variance', 'AvgTrueRange', 'Upperband', 'Lowerband',
       'Middleband', 'APO', 'NATR', 'TRANGE', 'DMI', 'MACD', 'MACDSIGNAL',
       'MACDHIST', 'MOM', 'PPO', 'ROCP', 'RSI', 'TRIX', 'ULTOSC', 'SLOWK',
       'SLOWD', 'AD', 'ADOSC', 'OBV', 'Upward_momentum_created',
       'Downward_momentum_created', 'B5_O_Um', 'B5_C_Um', 'B5_E_Um', 'B5_A_Um',
       'B5_N_Um', 'B5_O_Dm', 'B5_C_Dm', 'B5_E_Dm', 'B5_A_Dm', 'B5_N_Dm',
       'Verified_status_True', 'Verified_status_False', 'O', 'C', 'E', 'A',
       'N', 'Real_or_Fake_tweet'],
      dtype='object')
In [1986]:
# Rows x columns of the freshly loaded frame.
df.shape
Out[1986]:
(343, 52)
In [1987]:
# Count missing values per column before imputation (several indicator columns have NaNs).
df.isnull().sum()
Out[1987]:
Unnamed: 0                    0
Open                          0
High                          0
Low                           0
Close                         0
Adj Close                     0
Volume                        0
Return                        0
Beta                          0
Variance                      0
AvgTrueRange                  0
Upperband                     0
Lowerband                     0
Middleband                    0
APO                           3
NATR                          0
TRANGE                        0
DMI                           0
MACD                         11
MACDSIGNAL                   11
MACDHIST                     11
MOM                           0
PPO                           3
ROCP                          0
RSI                           0
TRIX                         66
ULTOSC                        6
SLOWK                         0
SLOWD                         0
AD                            0
ADOSC                         0
OBV                           0
Upward_momentum_created       0
Downward_momentum_created     0
B5_O_Um                       0
B5_C_Um                       0
B5_E_Um                       0
B5_A_Um                       0
B5_N_Um                       0
B5_O_Dm                       0
B5_C_Dm                       0
B5_E_Dm                       0
B5_A_Dm                       0
B5_N_Dm                       0
Verified_status_True          0
Verified_status_False         0
O                             0
C                             0
E                             0
A                             0
N                             0
Real_or_Fake_tweet            0
dtype: int64
In [1988]:
# Impute remaining NaNs with each column's median, drop the CSV index
# artifact column, and rename the label column to match the daily frame.
df = (df.fillna(df.median())
        .drop(columns=['Unnamed: 0'])
        .rename(columns={'Real_or_Fake_tweet': 'Fake_news'}))
In [1989]:
# Aggregate the daily rows into weekly means ('W' bins on the Date index).
df_weekly = df.resample('W').mean()
In [1990]:
# Rows x columns after weekly aggregation.
df_weekly.shape
Out[1990]:
(72, 51)
In [1991]:
# Full correlation heatmap of the weekly-aggregated features.
weekly_corr = df_weekly.corr()
plt.figure(figsize=(40, 15))
sns.heatmap(weekly_corr, annot=True)
Out[1991]:
<matplotlib.axes._subplots.AxesSubplot at 0x7f077313a4d0>
In [1992]:
# Shrink seaborn fonts again for the weekly plots.
sns.set(font_scale=0.8)
In [1993]:
# Per-column histograms of the weekly frame; the trailing ';' suppresses the axes repr.
df_weekly.hist(figsize=(20, 32), bins=50, xlabelsize=8, ylabelsize=8);
In [1994]:
# Weekly features whose correlation with 'AvgTrueRange' exceeds 0.5 in magnitude, strongest first.
df_corr = df_weekly.corr()['AvgTrueRange']
strong = df_corr[df_corr.abs() > 0.5]
golden_features_list = strong.sort_values(ascending=False)
print("There are {} strongly correlated values with AvgTrueRange:\n{}".format(len(golden_features_list), golden_features_list))
There are 16 strongly correlated values with AvgTrueRange:
AvgTrueRange    1.000000
NATR            0.880465
TRANGE          0.738215
Upperband       0.686851
High            0.631957
Open            0.614825
Close           0.614126
Middleband      0.609405
Low             0.596961
Adj Close       0.596197
OBV             0.587862
Variance        0.540119
MACDSIGNAL      0.537387
TRIX            0.535945
Volume          0.530629
MACD            0.510029
Name: AvgTrueRange, dtype: float64
In [1995]:
# Weekly features whose correlation with 'NATR' exceeds 0.5 in magnitude, strongest first.
df_corr = df_weekly.corr()['NATR']
strong = df_corr[df_corr.abs() > 0.5]
golden_features_list = strong.sort_values(ascending=False)
print("There are {} strongly correlated values with NATR :\n{}".format(len(golden_features_list), golden_features_list))
There are 4 strongly correlated values with NATR :
NATR            1.000000
AvgTrueRange    0.880465
TRANGE          0.635359
Volume          0.513530
Name: NATR, dtype: float64
In [1996]:
# Weekly features whose correlation with 'TRANGE' exceeds 0.5 in magnitude, strongest first.
df_corr = df_weekly.corr()['TRANGE']
strong = df_corr[df_corr.abs() > 0.5]
golden_features_list = strong.sort_values(ascending=False)
print("There are {} strongly correlated values with TRANGE:\n{}".format(len(golden_features_list), golden_features_list))
There are 17 strongly correlated values with TRANGE:
TRANGE                       1.000000
Volume                       0.798480
AvgTrueRange                 0.738215
Fake_news                    0.727466
C                            0.727466
O                            0.727466
N                            0.727466
Verified_status_False        0.724737
Downward_momentum_created    0.673423
B5_O_Dm                      0.673423
B5_C_Dm                      0.673423
B5_N_Dm                      0.673423
Verified_status_True         0.672297
NATR                         0.635359
Upperband                    0.520842
High                         0.512193
OBV                          0.505688
Name: TRANGE, dtype: float64
In [1997]:
# Features strongly correlated (|r| > 0.5) with Openness ('O').
openness_corr = df_weekly.corr()['O']
golden_features_list = openness_corr[openness_corr.abs() > 0.5].sort_values(ascending=False)
print("There are {} strongly correlated values with Openness:\n{}".format(len(golden_features_list), golden_features_list))
There are 12 strongly correlated values with Openness:
Fake_news                    1.000000
N                            1.000000
C                            1.000000
O                            1.000000
Verified_status_False        0.999657
B5_N_Dm                      0.889328
B5_C_Dm                      0.889328
B5_O_Dm                      0.889328
Downward_momentum_created    0.889328
Verified_status_True         0.857056
Volume                       0.800653
TRANGE                       0.727466
Name: O, dtype: float64
In [1998]:
# Features strongly correlated (|r| > 0.5) with conscientiousness ('C').
consc_corr = df_weekly.corr()['C']
golden_features_list = consc_corr[consc_corr.abs() > 0.5].sort_values(ascending=False)
print("There are {} strongly correlated values with conscientiousness:\n{}".format(len(golden_features_list), golden_features_list))
There are 12 strongly correlated values with conscientiousness:
Fake_news                    1.000000
N                            1.000000
C                            1.000000
O                            1.000000
Verified_status_False        0.999657
B5_N_Dm                      0.889328
B5_C_Dm                      0.889328
B5_O_Dm                      0.889328
Downward_momentum_created    0.889328
Verified_status_True         0.857056
Volume                       0.800653
TRANGE                       0.727466
Name: C, dtype: float64
In [1999]:
# Features strongly correlated (|r| > 0.5) with Extraversion ('E').
# Fix: the print label previously said "conscientiousness" — a copy-paste
# error from the 'C' cell; 'E' is the Extraversion trait.
df_corr = df_weekly.corr()['E']
golden_features_list = df_corr[abs(df_corr) > 0.5].sort_values(ascending=False)
print("There are {} strongly correlated values with Extraversion:\n{}".format(len(golden_features_list), golden_features_list))
There are 0 strongly correlated values with conscientiousness:
Series([], Name: E, dtype: float64)
In [2000]:
# Features strongly correlated (|r| > 0.5) with Agreeableness ('A').
# Fix: the print label previously said "conscientiousness" — a copy-paste
# error from the 'C' cell; 'A' is the Agreeableness trait.
df_corr = df_weekly.corr()['A']
golden_features_list = df_corr[abs(df_corr) > 0.5].sort_values(ascending=False)
print("There are {} strongly correlated values with Agreeableness:\n{}".format(len(golden_features_list), golden_features_list))
There are 0 strongly correlated values with conscientiousness:
Series([], Name: A, dtype: float64)
In [2001]:
# Features strongly correlated (|r| > 0.5) with Neuroticism ('N').
# Fix: the print label previously said "conscientiousness" — a copy-paste
# error from the 'C' cell; 'N' is the Neuroticism trait.
df_corr = df_weekly.corr()['N']
golden_features_list = df_corr[abs(df_corr) > 0.5].sort_values(ascending=False)
print("There are {} strongly correlated values with Neuroticism:\n{}".format(len(golden_features_list), golden_features_list))
There are 12 strongly correlated values with conscientiousness:
Fake_news                    1.000000
N                            1.000000
C                            1.000000
O                            1.000000
Verified_status_False        0.999657
B5_N_Dm                      0.889328
B5_C_Dm                      0.889328
B5_O_Dm                      0.889328
Downward_momentum_created    0.889328
Verified_status_True         0.857056
Volume                       0.800653
TRANGE                       0.727466
Name: N, dtype: float64
In [2002]:
# Features strongly correlated (|r| > 0.5) with B5_O_Um.
um_corr = df_weekly.corr()['B5_O_Um']
golden_features_list = um_corr[um_corr.abs() > 0.5].sort_values(ascending=False)
print("There are {} strongly correlated values with B5_O_Um:\n{}".format(len(golden_features_list), golden_features_list))
There are 0 strongly correlated values with B5_O_Um:
Series([], Name: B5_O_Um, dtype: float64)
In [2003]:
# Features strongly correlated (|r| > 0.5) with B5_C_Um.
um_corr = df_weekly.corr()['B5_C_Um']
golden_features_list = um_corr[um_corr.abs() > 0.5].sort_values(ascending=False)
print("There are {} strongly correlated values with B5_C_Um:\n{}".format(len(golden_features_list), golden_features_list))
There are 0 strongly correlated values with B5_C_Um:
Series([], Name: B5_C_Um, dtype: float64)
In [2004]:
# Features strongly correlated (|r| > 0.5) with B5_E_Um.
um_corr = df_weekly.corr()['B5_E_Um']
golden_features_list = um_corr[um_corr.abs() > 0.5].sort_values(ascending=False)
print("There are {} strongly correlated values with B5_E_Um:\n{}".format(len(golden_features_list), golden_features_list))
There are 0 strongly correlated values with B5_E_Um:
Series([], Name: B5_E_Um, dtype: float64)
In [2005]:
# Features strongly correlated (|r| > 0.5) with B5_A_Um.
um_corr = df_weekly.corr()['B5_A_Um']
golden_features_list = um_corr[um_corr.abs() > 0.5].sort_values(ascending=False)
print("There are {} strongly correlated values with B5_A_Um:\n{}".format(len(golden_features_list), golden_features_list))
There are 0 strongly correlated values with B5_A_Um:
Series([], Name: B5_A_Um, dtype: float64)
In [2006]:
# Features strongly correlated (|r| > 0.5) with B5_N_Um.
um_corr = df_weekly.corr()['B5_N_Um']
golden_features_list = um_corr[um_corr.abs() > 0.5].sort_values(ascending=False)
print("There are {} strongly correlated values with B5_N_Um:\n{}".format(len(golden_features_list), golden_features_list))
There are 0 strongly correlated values with B5_N_Um:
Series([], Name: B5_N_Um, dtype: float64)

Downward momentum correlation

In [2007]:
# Features strongly correlated (|r| > 0.5) with B5_O_Dm.
dm_corr = df_weekly.corr()['B5_O_Dm']
golden_features_list = dm_corr[dm_corr.abs() > 0.5].sort_values(ascending=False)
print("There are {} strongly correlated values with B5_O_Dm:\n{}".format(len(golden_features_list), golden_features_list))
There are 13 strongly correlated values with B5_O_Dm:
B5_N_Dm                      1.000000
B5_C_Dm                      1.000000
B5_O_Dm                      1.000000
Downward_momentum_created    1.000000
Fake_news                    0.889328
N                            0.889328
C                            0.889328
O                            0.889328
Verified_status_False        0.883172
Verified_status_True         0.877401
Volume                       0.732760
TRANGE                       0.673423
Return                       0.502058
Name: B5_O_Dm, dtype: float64
In [2008]:
# Features strongly correlated (|r| > 0.5) with B5_C_Dm.
dm_corr = df_weekly.corr()['B5_C_Dm']
golden_features_list = dm_corr[dm_corr.abs() > 0.5].sort_values(ascending=False)
print("There are {} strongly correlated values with B5_C_Dm:\n{}".format(len(golden_features_list), golden_features_list))
There are 13 strongly correlated values with B5_C_Dm:
B5_N_Dm                      1.000000
B5_C_Dm                      1.000000
B5_O_Dm                      1.000000
Downward_momentum_created    1.000000
Fake_news                    0.889328
N                            0.889328
C                            0.889328
O                            0.889328
Verified_status_False        0.883172
Verified_status_True         0.877401
Volume                       0.732760
TRANGE                       0.673423
Return                       0.502058
Name: B5_C_Dm, dtype: float64
In [2009]:
# Features strongly correlated (|r| > 0.5) with B5_E_Dm.
dm_corr = df_weekly.corr()['B5_E_Dm']
golden_features_list = dm_corr[dm_corr.abs() > 0.5].sort_values(ascending=False)
print("There are {} strongly correlated values with B5_E_Dm:\n{}".format(len(golden_features_list), golden_features_list))
There are 0 strongly correlated values with B5_E_Dm:
Series([], Name: B5_E_Dm, dtype: float64)
In [2010]:
# Features strongly correlated (|r| > 0.5) with B5_A_Dm.
dm_corr = df_weekly.corr()['B5_A_Dm']
golden_features_list = dm_corr[dm_corr.abs() > 0.5].sort_values(ascending=False)
print("There are {} strongly correlated values with B5_A_Dm:\n{}".format(len(golden_features_list), golden_features_list))
There are 0 strongly correlated values with B5_A_Dm:
Series([], Name: B5_A_Dm, dtype: float64)
In [2011]:
# Features strongly correlated (|r| > 0.5) with B5_N_Dm.
dm_corr = df_weekly.corr()['B5_N_Dm']
golden_features_list = dm_corr[dm_corr.abs() > 0.5].sort_values(ascending=False)
print("There are {} strongly correlated values with B5_N_Dm:\n{}".format(len(golden_features_list), golden_features_list))
There are 13 strongly correlated values with B5_N_Dm:
B5_N_Dm                      1.000000
B5_C_Dm                      1.000000
B5_O_Dm                      1.000000
Downward_momentum_created    1.000000
Fake_news                    0.889328
N                            0.889328
C                            0.889328
O                            0.889328
Verified_status_False        0.883172
Verified_status_True         0.877401
Volume                       0.732760
TRANGE                       0.673423
Return                       0.502058
Name: B5_N_Dm, dtype: float64
In [2012]:
# Features strongly correlated (|r| > 0.5) with Fake_news.
# Fix: the print label previously used the pre-rename column name
# "Real_or_Fake_tweet"; the column was renamed to Fake_news earlier.
df_corr = df_weekly.corr()['Fake_news']
golden_features_list = df_corr[abs(df_corr) > 0.5].sort_values(ascending=False)
print("There are {} strongly correlated values with Fake_news :\n{}".format(len(golden_features_list), golden_features_list))
There are 12 strongly correlated values with Real_or_Fake_tweet :
Fake_news                    1.000000
N                            1.000000
C                            1.000000
O                            1.000000
Verified_status_False        0.999657
B5_N_Dm                      0.889328
B5_C_Dm                      0.889328
B5_O_Dm                      0.889328
Downward_momentum_created    0.889328
Verified_status_True         0.857056
Volume                       0.800653
TRANGE                       0.727466
Name: Fake_news, dtype: float64
In [2013]:
# Features strongly correlated (|r| > 0.5) with Downward_momentum_created.
dm_created_corr = df_weekly.corr()['Downward_momentum_created']
golden_features_list = dm_created_corr[dm_created_corr.abs() > 0.5].sort_values(ascending=False)
print("There are {} strongly correlated values with Downward_momentum_created :\n{}".format(len(golden_features_list), golden_features_list))
There are 13 strongly correlated values with Downward_momentum_created :
B5_N_Dm                      1.000000
B5_C_Dm                      1.000000
B5_O_Dm                      1.000000
Downward_momentum_created    1.000000
Fake_news                    0.889328
N                            0.889328
C                            0.889328
O                            0.889328
Verified_status_False        0.883172
Verified_status_True         0.877401
Volume                       0.732760
TRANGE                       0.673423
Return                       0.502058
Name: Downward_momentum_created, dtype: float64
In [2014]:
# Features strongly correlated (|r| > 0.5) with Upward_momentum_created.
um_created_corr = df_weekly.corr()['Upward_momentum_created']
golden_features_list = um_created_corr[um_created_corr.abs() > 0.5].sort_values(ascending=False)
print("There are {} strongly correlated values with Upward_momentum_created :\n{}".format(len(golden_features_list), golden_features_list))
There are 0 strongly correlated values with Upward_momentum_created :
Series([], Name: Upward_momentum_created, dtype: float64)
In [2015]:
# Features strongly correlated (|r| > 0.5) with Verified_status_True.
verified_corr = df_weekly.corr()['Verified_status_True']
golden_features_list = verified_corr[verified_corr.abs() > 0.5].sort_values(ascending=False)
print("There are {} strongly correlated values with Verified_status_True :\n{}".format(len(golden_features_list), golden_features_list))
There are 12 strongly correlated values with Verified_status_True :
Verified_status_True         1.000000
B5_N_Dm                      0.877401
B5_C_Dm                      0.877401
B5_O_Dm                      0.877401
Downward_momentum_created    0.877401
Fake_news                    0.857056
N                            0.857056
C                            0.857056
O                            0.857056
Verified_status_False        0.843278
Volume                       0.722134
TRANGE                       0.672297
Name: Verified_status_True, dtype: float64
In [2016]:
# Features strongly correlated (|r| > 0.5) with Verified_status_False.
unverified_corr = df_weekly.corr()['Verified_status_False']
golden_features_list = unverified_corr[unverified_corr.abs() > 0.5].sort_values(ascending=False)
print("There are {} strongly correlated values with Verified_status_False :\n{}".format(len(golden_features_list), golden_features_list))
There are 12 strongly correlated values with Verified_status_False :
Verified_status_False        1.000000
Fake_news                    0.999657
N                            0.999657
C                            0.999657
O                            0.999657
B5_N_Dm                      0.883172
B5_C_Dm                      0.883172
B5_O_Dm                      0.883172
Downward_momentum_created    0.883172
Verified_status_True         0.843278
Volume                       0.798554
TRANGE                       0.724737
Name: Verified_status_False, dtype: float64
In [2017]:
# Re-apply the reduced font scale before the pairplots (same value as the
# earlier sns.set call).
sns.set(font_scale=0.8)
In [2018]:
# Pairplots of NATR against every feature, five x-columns per figure.
chunk_size = 5
for start in range(0, len(df_weekly.columns), chunk_size):
    x_chunk = df_weekly.columns[start:start + chunk_size]
    sns.pairplot(data=df_weekly, x_vars=x_chunk, y_vars=['NATR'])
In [2019]:
# Replace any remaining NaNs in the weekly frame with 0.
df_weekly.fillna(0, inplace = True)
In [2020]:
# NOTE(review): the fillna(0) in the previous cell already removed all NaNs,
# so this dropna should be a no-op — likely redundant.
df_weekly.dropna(inplace=True)
In [2021]:
# Correlation heatmap with Close excluded, showing only cells where
# r >= 0.5 or r <= -0.4 (everything else is masked out as NaN).
corr = df_weekly.drop('Close', axis=1).corr()
plt.figure(figsize=(12, 10))

strong_only = corr[(corr >= 0.5) | (corr <= -0.4)]
sns.heatmap(strong_only,
            cmap='YlGnBu', vmax=1.0, vmin=-1.0, linewidths=0.1,
            annot=True, annot_kws={"size": 8}, square=True);

Weekly volatility distribution

In [2022]:
# Distribution of weekly NATR with a fitted normal curve overlaid.
with sns.axes_style("darkgrid"):
    fig, ax = plt.subplots(figsize=(10,6))
    sns.distplot(df_weekly.NATR, ax=ax,
                 norm_hist=True, fit=stats.norm, bins=50)
    plt.title('Weekly Volatility Distribution')
    plt.show();